################################################################################
#                                                                              #
# TITLE        : BREAKING SILENCE: HOW INTIMATE PARTNER VIOLENCE AND           #         
#                REPORTING SHAPE LATER LIFE OUTCOMES                           #        
# CODE EDITOR  : Harrison Chang                                                #
# LAST MODIFIED: 2025/07/18                                                    #
# PURPOSE      : This file conducts main regression analysis                   #
#                                                                              #
#                                                                              #
#                                                                              #
#                                                                              #
################################################################################


rm(list=ls());gc()

library(data.table)
library(arrow)
library(stringr)
library(tidyverse)
library(dplyr)

Disk                  <- "E"

source(paste0(Disk, ":/H112057/arrow/arrow_20231023/arrow_helpers.R"))
source(paste0(Disk, ":/H112057/H112057_Harrison/DV/code/graph.R"))
source(paste0(Disk, ":/H112057/H112057_Harrison/DV/code/eventcode_IV.R"))



###################################################
#                       Setting                   #
###################################################

setwd(paste0(Disk, ":/H112057/H112057_Harrison/DV"))

output.dir            <- paste0(Disk, ":/H112057/H112057_Harrison/DV/processed_data")
par.dir               <- paste0(Disk, ":/H112057/parquet")
csv.dir               <- paste0(Disk, ":/H112057/data")
result.dir            <- paste0(Disk, ":/H112057/H112057_Harrison/DV/results")
CleanData.dir         <- paste0(Disk, ":/H112057/CleanData/processed_data")

gc()



###################################################
#                   Global Setting                #
###################################################

output.path           <- c("export/")

sample_year           <- c(2012:2018)
data_year             <- c(2004:2018)

birthyear             <- TRUE
education             <- TRUE

perp_edu_group        <- FALSE
perp_income_group     <- FALSE

isopen                <- FALSE
type_physical         <- FALSE
duration_year         <- FALSE
infounit              <- FALSE

stratify_or_not       <- FALSE
stratify_variable     <- c("edu_group")

sample_selection      <- c("IPVf_once")


violence_sample       <- FALSE
violence_effect       <- FALSE
decompose_duration    <- FALSE
year_before_report    <- 2

bacon                 <- FALSE
young                 <- FALSE
NoBirth               <- FALSE
static_reg            <- FALSE


control_variable      <- c()
if ( birthyear == TRUE ){
  control_variable    <- rbind(control_variable,"birth_year")
}
if ( education == TRUE ){
  control_variable    <- rbind(control_variable,"max_edu_year")
}
if ( type_physical == TRUE ){
  control_variable    <- rbind(control_variable,"type_physical")
}
if ( isopen == TRUE ){
  control_variable    <- rbind(control_variable,"isopen")
}
if ( duration_year == TRUE ){
  control_variable    <- rbind(control_variable,"duration_year")
}
if ( perp_income_group == TRUE ){
  control_variable    <- rbind(control_variable,"perp_income_group")
}
if ( perp_edu_group == TRUE ){
  control_variable    <- rbind(control_variable,"perp_edu_group")
}
if ( infounit == TRUE ){
  control_variable    <- rbind(control_variable,"infounit")
}







###################################################
#                       Data                      #
###################################################

data                  <- open_dataset(paste0(output.dir, "/data/IPV_victim_general_panel.parquet")) %>% 
                         collect() %>% 
                         as.data.table()


data                  <- data[complete_marr_dummy == 1]


dt_sample             <- data[event_year %in% sample_year][year %in% data_year]

rm(data);gc()

dt_sample[, ID_no:= rleid(ID)]


if ( sample_selection == "IPVf_once"){
  dt_sample           <- dt_sample[id_sex== 0 & repeated == 0]
} else if (sample_selection == "IPVf_repeat"){
  dt_sample           <- dt_sample[id_sex== 0 & repeated == 1]
} else if (sample_selection == "IPVm_once"){
  dt_sample           <- dt_sample[id_sex== 1 & repeated == 0]
}



dt_sample[, birth_year   := as.integer(ID_BIRTH_Y)] 
dt_sample[, max_edu_year := as.integer(highest_edu)] 

dt_sample[, high_edu := ifelse(max_edu_year>=12,1,0)]
dt_sample[max_edu_year>=0, .N, .(high_edu)][order(high_edu)]

dt_sample[, vic_age := event_year - birth_year]
dt_sample[vic_age %in% c(25:35), age_group := 1]
dt_sample[vic_age %in% c(36:45), age_group := 2]
dt_sample[vic_age %in% c(46:55), age_group := 3]

if ( young == TRUE){
  dt_sample       <- dt_sample[age_group==1]
  
}


dt_sample[, AlwaysSingle   := ifelse(all(MARR==1),1,0), by = "ID"]
dt_sample[, AlwaysMarried  := ifelse(all(MARR==2),1,0), by = "ID"]
dt_sample[, AlwaysDivorced := ifelse(all(MARR==3),1,0), by = "ID"]
dt_sample[, EverDivorced   := ifelse(any(MARR==3),1,0), by = "ID"]

dt_sample[, relative_year  := year - event_year]
dt_sample[, PostDivorced   := ifelse(relative_year==0 & get_divorced==1|
                                       relative_year==1 & get_divorced==1|
                                       relative_year==2 & get_divorced==1|
                                       relative_year==3 & get_divorced==1|
                                       relative_year==4 & get_divorced==1,1,0), by = "ID"]

dt_sample[, Post01234      := ifelse(any(PostDivorced==1),1,0), by = "ID"]


unique(dt_sample, by = "ID")[, .N, .(AlwaysSingle)]
unique(dt_sample, by = "ID")[, .N, .(AlwaysMarried)]
unique(dt_sample, by = "ID")[, .N, .(AlwaysDivorced)]
unique(dt_sample, by = "ID")[, .N, .(EverDivorced)]
unique(dt_sample, by = "ID")[, .N, .(Post01234)]


dt_sample[AlwaysSingle==1              , marr_status := 1]
dt_sample[AlwaysMarried==1             , marr_status := 2]
dt_sample[AlwaysDivorced==1            , marr_status := 3]
dt_sample[is.na(marr_status)==1        , marr_status := 4]
dt_sample[Post01234==1 & marr_status==4, marr_status := 5]

unique(dt_sample, by = "ID")[, .N, .(marr_status)][order(marr_status)]

dt_sample[, PostBirth      := ifelse(relative_year==0 & year_give_birth==1|
                                       relative_year==1 & year_give_birth==1|
                                       relative_year==2 & year_give_birth==1|
                                       relative_year==3 & year_give_birth==1|
                                       relative_year==4 & year_give_birth==1,1,0), by = "ID"]


dt_sample[, Birth01234      := ifelse(any(PostBirth==1),1,0), by = "ID"]

unique(dt_sample, by = "ID")[, .N, .(Birth01234)]

if (NoBirth == TRUE){
 dt_sample <- dt_sample[Birth01234==0]
}


###################################################
#     Report & Violence Effect Decomposition      #
###################################################

dt_sample[, calendarYr := year]
dt_sample[, dvYr       := event_year]
dt_sample[, onset_time := min(sample_year) -1]
dt_sample[!duplicated(ID)] %>% nrow()

if (violence_sample == TRUE){
  violence_time       <- open_dataset(paste0(output.dir,"/data/2012_2018/IPV_violence_year.parquet")) %>% 
                         collect() %>% 
                         as.data.table()
  
  violence_time[, .N, .(duration_year)][order(duration_year)]
  violence_time[, violence_year := report_year - duration_year]
  
  dt_sample       <- violence_time[dt_sample, on = "ID", nomatch = NULL]
  dt_sample[, .N, .(duration_year)][order(duration_year)]
  dt_sample       <- dt_sample[duration_year!=0 & duration_year!=10]
  
  nrow(dt_sample[!duplicated(ID)])
}

if (violence_effect == TRUE){
  
  dt_sample[, calendarYr := year][, dvYr := violence_year]
  export          <- unique(dt_sample, by = "ID")[, .N, .(duration_year)][order(duration_year)]
  dt_sample       <- dt_sample[duration_year!=0 & duration_year!=10]
  nrow(dt_sample[!duplicated(ID)])  
  
}

if (violence_sample == TRUE & decompose_duration == TRUE){
  
  dt_sample       <- dt_sample[duration_year == year_before_report & duration_year !=0]
  dt_sample[, .N,. (duration_year)][order(duration_year)]
  nrow(dt_sample[!duplicated(ID)])  
  
}





###################################################
#              Modify Matching Criteria           #
###################################################

dt_sample         <- dt_sample[!(is.na(dvYr))][!(is.na(max_edu_year))][!(is.na(birth_year))]

sample_obs        <- nrow(dt_sample[!duplicated(ID)])
sample_obs 

gc()


dt_sample[, max_income := ifelse(max_pre_income>=276813,1,0)]
dt_sample[, .N, .(max_income)]

dt_sample[, .N, .(INFOUNIT)][order(N)]
dt_sample[INFOUNIT == "D"     , infounit := 1]
dt_sample[INFOUNIT == "C"     , infounit := 2]
dt_sample[INFOUNIT == "I"     , infounit := 3]
dt_sample[is.na(infounit) == 1, infounit := 4]
dt_sample[, .N, .(infounit)][order(infounit)]

dt_sample[, .N, .(ISOPENCASE)]
dt_sample[, isopen := ifelse(ISOPENCASE=="Y" & is.na(ISOPENCASE)==0,1,0)]
dt_sample[, .N, .(isopen)]

dt_sample[, .N, .(max_edu_year)][order(max_edu_year)]

dt_sample[max_edu_year <= 9, edu_group := 1]
dt_sample[max_edu_year ==12, edu_group := 2]
dt_sample[max_edu_year >=14, edu_group := 3]

dt_sample[, .N, .(perp_edu)][order(perp_edu)]

dt_sample[perp_edu     <= 9, perp_edu_group := 0]
dt_sample[perp_edu     >=12, perp_edu_group := 1]
dt_sample[is.na(perp_edu)  , perp_edu_group := 2]

dt_sample[, .N, .(perp_edu_group)][order(perp_edu_group)]

dt_sample[, .N, .(perp_max_pre_income)][order(perp_max_pre_income)]

median_income     <- unique(dt_sample[is.na(perp_max_pre_income)==0,.(ACTIONID,perp_max_pre_income)], by = "ACTIONID")[, median := median(perp_max_pre_income)]
median_value      <- median_income$median %>% unique() %>% as.numeric()

dt_sample[perp_max_pre_income     <= median_value, perp_income_group := 0]
dt_sample[perp_max_pre_income     >  median_value, perp_income_group := 1]
dt_sample[is.na(perp_max_pre_income)             , perp_income_group := 2]

unique(dt_sample, by = "ID")[, .N, .(perp_income_group)][order(perp_income_group)]

dt_sample[,     `:=` (child_1_age = event_year - child_1_birth_year,
                      child_2_age = event_year - child_2_birth_year,
                      child_3_age = event_year - child_3_birth_year,
                      child_4_age = event_year - child_4_birth_year,
                      child_5_age = event_year - child_5_birth_year)]

dt_sample[, num_children_under_12 := rowSums(.SD < 12, na.rm = TRUE), .SDcols = paste0("child_", 1:5, "_age")]



#City Code
dt_sample[, .N, .(OWNERCITYCODE)][order(OWNERCITYCODE)]
dt_sample[OWNERCITYCODE=="6300000000"                            , city := 1]
dt_sample[OWNERCITYCODE=="1000100000"|OWNERCITYCODE=="6500000000", city := 2]
dt_sample[OWNERCITYCODE=="1000300000"                            , city := 3]
dt_sample[OWNERCITYCODE=="1001800000"                            , city := 4]
dt_sample[OWNERCITYCODE=="1000400000"                            , city := 5]
dt_sample[OWNERCITYCODE=="1000500000"                            , city := 6]
dt_sample[OWNERCITYCODE=="1000600000"|OWNERCITYCODE=="1001900000", city := 7]
dt_sample[OWNERCITYCODE=="1000700000"                            , city := 8]
dt_sample[OWNERCITYCODE=="1000900000"                            , city := 9]
dt_sample[OWNERCITYCODE=="1001000000"|OWNERCITYCODE=="1002000000", city :=10]
dt_sample[OWNERCITYCODE=="1002100000"|OWNERCITYCODE=="1001100000", city :=11]
dt_sample[OWNERCITYCODE=="6400000000"                            , city :=12]
dt_sample[OWNERCITYCODE=="1001300000"                            , city :=14]
dt_sample[OWNERCITYCODE=="1001400000"                            , city :=15]
dt_sample[OWNERCITYCODE=="1001500000"                            , city :=16]
dt_sample[OWNERCITYCODE=="1000200000"                            , city :=17]
dt_sample[OWNERCITYCODE=="1001600000"                            , city :=18]
dt_sample[OWNERCITYCODE=="900700000"                             , city :=19]
dt_sample[OWNERCITYCODE=="902000000"                             , city :=20]
dt_sample[OWNERCITYCODE=="1001700000"                            , city :=21]
dt_sample[OWNERCITYCODE=="1000800000"                            , city :=22]

dt_sample[, .N, .(city)][order(city)]



sample_obs        <- nrow(dt_sample[!duplicated(ID)])
sample_obs

unique(dt_sample, by = "ID")[, .N, .(infounit,city)][order(city,infounit)]
dt_sample[, cluster_level := paste(infounit,city)]



###################################################
#                  Stack Event Data               #
###################################################

cat("begin creating event data at \n", date.time(), '\n')

if ( stratify_or_not == TRUE){
  eventdata <- create_event_data(dt_sample,
                                 timevar                  = "calendarYr",
                                 unitvar                  = "ID",
                                 cohortvar                = "dvYr",
                                 onset_agevar             = "onset_time",
                                 covariate_base_balance   = control_variable,
                                 covariate_base_stratify  = stratify_variable,
                                 never_treat_action       = "exclude")
} else{
  eventdata <- create_event_data(dt_sample,
                                 timevar                  = "calendarYr",
                                 unitvar                  = "ID",
                                 cohortvar                = "dvYr",
                                 onset_agevar             = "onset_time",
                                 covariate_base_balance   = control_variable,
                                 never_treat_action       = "exclude",
                                 stratify_by_cohort       = T) 
  
}


cat("finish creating event data at \n", date.time(), '\n')
gc()

# create a blank data.table for recording outputs
results           <- data.table()

#We first back up the data
orig              <- copy(eventdata)






###################################################
#                      Caption                    #
###################################################

#We have get_divorced, year_full_dummy, year_give_birth, depression_dummy,
# FTWORKING, TRUE_AMT, year_full_dummy2, all_dot, physical_dot, mental_dot
outcome_variable  <- c("depression_dummy")
rm(eventdata);gc()
eventdata         <- orig





###################################################
#                 Dynamic Regression              #
###################################################

eventdata         <- construct_event_variables(eventdata)
dt_dynamic        <- data.table()
gc()


dt_dynamic        <- get_result_dynamic(eventdata, outcome_variable, trends = F)
dt_dynamic[, model := NULL]
dt_dynamic[, model := 5]

gc()






###################################################
#                       Figure                    #
###################################################

if ( violence_sample == TRUE & violence_effect == FALSE & decompose_duration == FALSE){
  dt_dynamic_pic  <- report(dt_dynamic) 
}else if (violence_sample == TRUE & violence_effect == TRUE & decompose_duration == FALSE){
  dt_dynamic_pic  <- violence_pooled(dt_dynamic)
}else if (violence_sample == TRUE & violence_effect == TRUE & decompose_duration == TRUE){
  dt_dynamic_pic  <- violence_duration(dt_dynamic)  
}else if (violence_sample == FALSE & violence_effect == FALSE & decompose_duration == FALSE){
  dt_dynamic_pic  <- report(dt_dynamic)  
}else if ( violence_sample == TRUE & violence_effect == FALSE & decompose_duration == TRUE){
  dt_dynamic_pic  <- report_duration(dt_dynamic)  
}


dt_dynamic_pic


cat("this event study is with",sample_obs,"sample","\n")



fwrite(dt_dynamic,file = paste0(output.path,outcome_variable,sample_selection,
                                sample_obs,violence_effect, ".csv"))


gc()









if ( static_reg == TRUE ){
  

###################################################
#                 Static Regression               #
###################################################


if ( stratify_or_not == FALSE ){
  
  stacked_mean      <- get_result_means(eventdata,outcome_variable,trends=T)
  stacked_mean[, model := NULL]
  dt_static         <- get_result_pooled(eventdata,outcome_variable,trends=T)
  dt_static         <- dt_static[variable=="treated_pre_stratify1.1"|variable=="treated_post_stratify1.1"]
  dt_static[, model := NULL]
  
  dt_static         <- cbind(dt_static, outcome_variable, sample_obs, violence_sample, violence_effect, stratify_or_not)
  dt_static         <- cbind(dt_static, stacked_mean)
  results           <- rbind(results, dt_static)
  
  fwrite(results,file = paste0(output.path,sample_selection,
                               sample_obs,violence_effect, "_static.csv"))
  
  gc()
  
}



if ( stratify_or_not == TRUE ){
  
  stacked_mean      <- get_result_means(eventdata,outcome_variable,trends=T)
  stacked_mean[, model := NULL]
  dt_static         <- get_result_pooled(eventdata,outcome_variable,trends=T)
  dt_static         <- dt_static[variable=="treated_pre_stratify1.0"  |
                                   variable=="treated_post_stratify1.0" |
                                   variable=="treated_pre_stratify1.1"  |
                                   variable=="treated_post_stratify1.1" |
                                   variable=="treated_pre_stratify1.2"  |
                                   variable=="treated_post_stratify1.2" | 
                                   variable=="treated_pre_stratify1.3"  |
                                   variable=="treated_post_stratify1.3" |
                                   variable=="treated_pre_stratify1.4"  |
                                   variable=="treated_post_stratify1.4" |
                                   variable=="treated_pre_stratify1.5"  |
                                   variable=="treated_post_stratify1.5" ]
  dt_static[, model := NULL]
  
  dt_static         <- cbind(dt_static, outcome_variable, sample_obs, violence_sample, violence_effect, stratify_or_not)
  dt_static         <- cbind(dt_static, stacked_mean)
  results           <- rbind(results, dt_static)
  
  fwrite(results,file = paste0(output.path,sample_selection,
                               sample_obs,violence_effect, "_age_static.csv"))
  
  gc()
  
}

}








